# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>

# <codecell>

# Run from a shell with: python spider.py
import urllib2
import sqlite3
from BeautifulSoup import BeautifulSoup
from hashlib import md5

class spider:
    """Breadth-first link crawler that searches anchor texts for a keyword.

    Starting from a seed URL, pages are fetched one at a time and every
    ``<a>`` element carrying an ``href`` is queued as a
    ``[url, anchor_text]`` pair.  The crawl stops as soon as a queued
    anchor text contains the keyword, or when the queue is exhausted.

    Constructing an instance runs the whole crawl before returning.
    """

    def __init__(self, url, keyword):
        """Seed the queue with *url* and immediately crawl for *keyword*.

        url     -- address of the first page to fetch.
        keyword -- text to look for inside anchor texts.
        """
        self.keyword = keyword
        # FIFO queue of [url, anchor text] pairs waiting to be examined.
        self.urldoc = []
        self.urldoc.append([url, ' '])
        # Visited map: URL -> True once the page has been fetched.
        self.urlcheck = {}
        self.start()

    def start(self):
        """Drain the queue until the keyword is found or nothing is left.

        For each queued pair the anchor text is checked first; on a match
        a message is printed and the crawl ends.  Otherwise the URL is
        fetched (once) and its links are appended to the queue.

        This is a loop rather than self-recursion: one recursive call per
        queue entry exceeds the interpreter's recursion limit on any page
        with many links.  An empty queue simply ends the crawl instead of
        raising IndexError.
        """
        while self.urldoc:
            url, doc = self.urldoc.pop(0)  # take the head of the queue
            if doc.find(self.keyword) > -1:
                # Single parenthesised argument: same output as the
                # Python 2 print statement.
                print('OK,found the keyword "' + self.keyword + '" in "' + doc + ' " ' + url)
                return
            if not self.urlcheck.get(url, False):  # not fetched yet
                self.urlcheck[url] = True
                self.getDoc(url)

    def getDoc(self, url):
        """Fetch *url* and queue every hyperlink found on the page.

        The URL is printed before fetching.  Each ``<a>`` element with an
        ``href`` is appended to ``self.urldoc`` as
        ``[absolute_url, anchor_text]``; relative hrefs are resolved
        against *url* so that they can actually be opened later.  Anchors
        without an ``href`` are skipped: they have no address to report
        or to crawl.

        Fetching is best effort: any ordinary error while downloading or
        parsing the page abandons that page and the crawl continues.
        """
        # Function-scope import keeps the new dependency inside this block.
        from urlparse import urljoin

        print(url)
        try:
            html = BeautifulSoup(urllib2.urlopen(url).read())
            for link in html.findAll('a'):
                href = link.get('href')
                if href is None:
                    continue
                self.urldoc.append([urljoin(url, href), link.getText()])
        except Exception:
            # Deliberately ignored (unreachable host, unsupported scheme,
            # malformed markup, ...).  Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate so the crawl can
            # be stopped with Ctrl-C.
            pass
        return

# Script driver: read a keyword from the console and crawl from the seed
# page.  Constructing the spider runs the crawl.
url = "http://news.163.com"
# For a non-interactive run, hard-code the keyword instead,
# e.g. keyword = u'华南'
# The raw console bytes are decoded as UTF-8 to obtain a unicode keyword.
keyword = unicode(raw_input('输入关键字:'), 'utf-8')
s = spider(url, keyword)
            

# <codecell>


